This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#install.packages("fastDummies")
#install.packages("randomForest")
library(ggplot2)
library(reshape2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dplyr)
library(lubridate)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the NYC vaccination trend datasets (daily cumulative dose counts,
# broken out by age, race/ethnicity, and borough) plus the coverage snapshot.
trends_byage <- read.csv("people/trends-byage.csv")
trends_byrace <- read.csv("people/trends-byrace.csv")
trends_byboro <- read.csv("people/trends-byboro.csv")
coverage_by_demo <- read.csv("people/coverage-by-demo.csv")
# Parse dates and restrict the age dataset to the Aug 2021 - Aug 2022 window
# (the booster rollout period analyzed below).
trends_byage$DATE <- as.Date(trends_byage$DATE)
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-08-31"))
# Reshape to long format: one row per (date, age-band cumulative count).
trends_long <- melt(
  trends_byage,
  id.vars = "DATE",
  measure.vars = grep("COUNT_ADDITIONAL_CUMULATIVE", names(trends_byage), value = TRUE)
)
colnames(trends_long)[colnames(trends_long) == "variable"] <- "Age_Group"
colnames(trends_long)[colnames(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
# Log-scale line chart of cumulative additional/booster doses per age band.
ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Age_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_log10() +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  labs(
    title = "Trends of Additional/Booster Doses by Age Group (Aug 2021 - Aug 2022)",
    x = "Date",
    y = "Cumulative Count of Additional Doses (Log Scale)"
  )
# Raw age-band columns as provided in the source CSV.
age_group_columns <- c("COUNT_ADDITIONAL_CUMULATIVE_0_4", "COUNT_ADDITIONAL_CUMULATIVE_5_12",
                       "COUNT_ADDITIONAL_CUMULATIVE_13_17", "COUNT_ADDITIONAL_CUMULATIVE_18_24",
                       "COUNT_ADDITIONAL_CUMULATIVE_25_34", "COUNT_ADDITIONAL_CUMULATIVE_35_44",
                       "COUNT_ADDITIONAL_CUMULATIVE_45_54", "COUNT_ADDITIONAL_CUMULATIVE_55_64",
                       "COUNT_ADDITIONAL_CUMULATIVE_65_74", "COUNT_ADDITIONAL_CUMULATIVE_75up")
# Coerce counts to numeric (non-numeric entries become NA), then zero-fill NAs.
trends_byage[age_group_columns] <- lapply(trends_byage[age_group_columns],
                                          function(x) as.numeric(as.character(x)))
trends_byage[is.na(trends_byage)] <- 0
# Collapse the narrow bands into broader analysis bands.
trends_byage$COUNT_ADDITIONAL_CUMULATIVE_0_17 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_0_4 +
  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_5_12 +
  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_13_17
trends_byage$COUNT_ADDITIONAL_CUMULATIVE_18_44 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_18_24 +
  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_25_34 +
  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_35_44
trends_byage$COUNT_ADDITIONAL_CUMULATIVE_45_64 <- trends_byage$COUNT_ADDITIONAL_CUMULATIVE_45_54 +
  trends_byage$COUNT_ADDITIONAL_CUMULATIVE_55_64
# NOTE: the 65_74 and 75up columns are used as-is. The original code contained
# self-assignments for them (x <- x), which were no-ops and have been removed.
trends_byage$DATE <- as.Date(trends_byage$DATE, format = "%Y-%m-%d")
age_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_0_17", "COUNT_ADDITIONAL_CUMULATIVE_18_44",
                "COUNT_ADDITIONAL_CUMULATIVE_45_64", "COUNT_ADDITIONAL_CUMULATIVE_65_74",
                "COUNT_ADDITIONAL_CUMULATIVE_75up")
# Trim to the Aug 2021 - May 2022 window used for modeling.
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
trends_long <- melt(trends_byage, id.vars = "DATE",
                    measure.vars = age_groups)
names(trends_long)[names(trends_long) == "variable"] <- "Age_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
# Plot the aggregated age-band trends on a log y-axis.
ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Age_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_log10() +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  labs(
    title = "Trends of Additional Doses by Age Group",
    x = "Date",
    y = "Cumulative Count of Additional Doses (Log Scale)"
  )
# Restrict the race dataset to the same Aug 2021 - May 2022 window.
trends_byrace$DATE <- as.Date(trends_byrace$DATE, format = "%Y-%m-%d")
trends_byrace <- filter(trends_byrace,
                        DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
# Cumulative additional-dose columns per race/ethnicity group.
race_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_AIAN", "COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI",
                 "COUNT_ADDITIONAL_CUMULATIVE_BLACK", "COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO",
                 "COUNT_ADDITIONAL_CUMULATIVE_WHITE", "COUNT_ADDITIONAL_CUMULATIVE_OTHER")
trends_long <- melt(trends_byrace, id.vars = "DATE", measure.vars = race_groups)
colnames(trends_long)[colnames(trends_long) == "variable"] <- "Race_Group"
colnames(trends_long)[colnames(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
# Log-scale trend lines per race group.
ggplot(trends_long, aes(x = DATE, y = Cumulative_Count, color = Race_Group)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_log10() +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  labs(
    title = "Trends of Additional Doses by Race Group from August 2021",
    x = "Date",
    y = "Cumulative Count of Additional Doses"
  )
Data Preprocessing
# Age bands used for per-group ARIMA residual diagnostics (0-4 excluded).
age_group_columns <- c("COUNT_ADDITIONAL_CUMULATIVE_5_12",
                       "COUNT_ADDITIONAL_CUMULATIVE_13_17", "COUNT_ADDITIONAL_CUMULATIVE_18_24",
                       "COUNT_ADDITIONAL_CUMULATIVE_25_34", "COUNT_ADDITIONAL_CUMULATIVE_35_44",
                       "COUNT_ADDITIONAL_CUMULATIVE_45_54", "COUNT_ADDITIONAL_CUMULATIVE_55_64",
                       "COUNT_ADDITIONAL_CUMULATIVE_65_74", "COUNT_ADDITIONAL_CUMULATIVE_75up")
trends_byage$DATE <- as.Date(trends_byage$DATE)
trends_byage <- filter(trends_byage, DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-08-31"))
trends_long <- melt(trends_byage, id.vars = "DATE",
                    measure.vars = grep("COUNT_ADDITIONAL_CUMULATIVE", names(trends_byage), value = TRUE))
names(trends_long)[names(trends_long) == "variable"] <- "Age_Group"
names(trends_long)[names(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
# FIX: the series starts in August 2021, not January 2020; the previous
# start = c(2020, 1) mislabeled the time index (ACF values themselves are
# unaffected by `start`, so the plots are unchanged).
# NOTE(review): the observations appear daily (see the daily dates printed
# below) while frequency = 12 treats them as monthly -- confirm the intended
# sampling interval for these models.
start_year <- 2021
start_month <- 8
for (age_group in age_group_columns) {
  group_data <- filter(trends_long, Age_Group == age_group)
  ts_data <- ts(group_data$Cumulative_Count, start = c(start_year, start_month), frequency = 12)
  model <- auto.arima(ts_data)
  residuals_data <- residuals(model)
  # A well-specified model should leave approximately white-noise residuals.
  Acf(residuals_data, main = paste("ACF of Residuals for", age_group))
}
Age Forecasting Model
# Forecast horizon setup: history ends May 2022; project through Dec 2025.
start_year <- 2021
start_month <- 8
forecasts <- list()
errors <- list()
latest_data_year <- 2022
latest_data_month <- 5
future_periods <- (2025 - latest_data_year) * 12 + (12 - latest_data_month)
for (age_group in age_group_columns) {
  group_data <- filter(trends_long, Age_Group == age_group)
  ts_data <- ts(group_data$Cumulative_Count, start = c(start_year, start_month), frequency = 12)
  # BUG FIX: tsCV passes `h` to forecastfunction; the original
  # `function(x)` signature rejected that argument, so every CV call
  # errored internally and the whole error series was silently NA.
  error <- tsCV(ts_data, forecastfunction = function(x, h) forecast(auto.arima(x), h = h), h = 1)
  errors[[age_group]] <- error
  # Fit on the full series and forecast `future_periods` steps ahead.
  model <- auto.arima(ts_data)
  forecast_data <- forecast(model, h = future_periods)
  forecasts[[age_group]] <- forecast_data
}
Age Forecasting Result
# Plot each age group's historical series together with its ARIMA forecast.
# FIX: seq_along() instead of 1:length() (safe if the list were empty).
for (i in seq_along(forecasts)) {
  age_group <- names(forecasts)[i]
  forecast_data <- forecasts[[age_group]]
  historical_data <- filter(trends_long, Age_Group == age_group) %>%
    select(Date = DATE, Count = Cumulative_Count) %>%
    mutate(Type = "Historical")
  # Forecast dates begin one month after the last observed month.
  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1),
                        length.out = length(forecast_data$mean),
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates,
                            Count = forecast_data$mean,
                            Type = "Forecasted")
  combined_df <- rbind(historical_data, forecast_df)
  p <- ggplot(combined_df, aes(x = Date, y = Count, color = Type)) +
    geom_line() +
    labs(title = paste("Forecast for Age Group", gsub("COUNT_ADDITIONAL_CUMULATIVE_", "", age_group), "(2021 - 2025)"),
         x = "Date",
         y = "Cumulative Count") +
    theme_minimal() +
    theme(legend.title = element_blank())
  print(p)  # explicit print required inside a loop
}
# Inspect the first rows of the last group's combined (historical + forecast) frame.
head(combined_df)
## Date Count Type
## 1 2021-08-01 7550 Historical
## 2 2021-08-02 7566 Historical
## 3 2021-08-03 7578 Historical
## 4 2021-08-04 7603 Historical
## 5 2021-08-05 7629 Historical
## 6 2021-08-06 7637 Historical
# Inspect the final forecasted rows (horizon ends December 2025).
tail(combined_df)
## Date Count Type
## 342 2025-07-01 300252.4 Forecasted
## 343 2025-08-01 300350.9 Forecasted
## 344 2025-09-01 300440.4 Forecasted
## 345 2025-10-01 300538.4 Forecasted
## 346 2025-11-01 300644.6 Forecasted
## 347 2025-12-01 300745.6 Forecasted
# Standard deviation of the combined historical + forecast counts,
# used below as a scale reference for the RMSE values.
std_dev_combined_df <- sd(combined_df$Count)
print(std_dev_combined_df)
## [1] 111226.3
Age RMSE
rmse_results <- list()
# NOTE(review): this "RMSE" compares the forecast path against the LAST
# length(forecast) historical observations, which predate the forecast
# window -- it is a plausibility/scale check rather than a true
# out-of-sample error. Confirm this is intentional.
# FIX: seq_along() instead of 1:length().
for (i in seq_along(forecasts)) {
  age_group <- names(forecasts)[i]
  forecast_data <- forecasts[[age_group]]
  historical_data <- filter(trends_long, Age_Group == age_group) %>%
    select(Date = DATE, Count = Cumulative_Count) %>%
    mutate(Type = "Historical")
  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1),
                        length.out = length(forecast_data$mean),
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates,
                            Count = forecast_data$mean,
                            Type = "Forecasted")
  # Actuals available up to the day before the forecast start.
  end_date_for_actuals <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) - 1
  actual_values <- historical_data$Count[historical_data$Date <= end_date_for_actuals]
  if (length(actual_values) >= length(forecast_data$mean)) {
    actual_values <- tail(actual_values, length(forecast_data$mean))
    forecasted_values <- forecast_data$mean
    rmse <- sqrt(mean((actual_values - forecasted_values)^2, na.rm = TRUE))
    rmse_results[[age_group]] <- rmse
  } else {
    # Not enough history to compare against; record NA explicitly.
    rmse_results[[age_group]] <- NA
  }
}
rmse_results
## $COUNT_ADDITIONAL_CUMULATIVE_5_12
## [1] 15321.36
##
## $COUNT_ADDITIONAL_CUMULATIVE_13_17
## [1] 11232.03
##
## $COUNT_ADDITIONAL_CUMULATIVE_18_24
## [1] 12890.49
##
## $COUNT_ADDITIONAL_CUMULATIVE_25_34
## [1] 20305.56
##
## $COUNT_ADDITIONAL_CUMULATIVE_35_44
## [1] 15702.67
##
## $COUNT_ADDITIONAL_CUMULATIVE_45_54
## [1] 16352.19
##
## $COUNT_ADDITIONAL_CUMULATIVE_55_64
## [1] 18716.3
##
## $COUNT_ADDITIONAL_CUMULATIVE_65_74
## [1] 13180.44
##
## $COUNT_ADDITIONAL_CUMULATIVE_75up
## [1] 9971.597
Age Moving Rate
# "Moving rate": linear-trend slope of each age group's forecast path,
# estimated by regressing forecasted counts on days since the forecast start.
slopes <- list()
for (grp in names(forecasts)) {
  fc <- forecasts[[grp]]
  first_fc_date <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1)
  fc_df <- data.frame(
    Date = seq(from = first_fc_date, length.out = length(fc$mean), by = "month"),
    Count = fc$mean
  )
  fc_df$Time <- as.numeric(fc_df$Date - min(fc_df$Date))  # days since forecast start
  fit <- lm(Count ~ Time, data = fc_df)
  slopes[[grp]] <- coef(fit)["Time"]
}
slopes
## $COUNT_ADDITIONAL_CUMULATIVE_5_12
## Time
## 10.61337
##
## $COUNT_ADDITIONAL_CUMULATIVE_13_17
## Time
## 4.739182
##
## $COUNT_ADDITIONAL_CUMULATIVE_18_24
## Time
## 4.496523
##
## $COUNT_ADDITIONAL_CUMULATIVE_25_34
## Time
## 7.108529
##
## $COUNT_ADDITIONAL_CUMULATIVE_35_44
## Time
## 4.63574
##
## $COUNT_ADDITIONAL_CUMULATIVE_45_54
## Time
## 4.275551
##
## $COUNT_ADDITIONAL_CUMULATIVE_55_64
## Time
## 5.032212
##
## $COUNT_ADDITIONAL_CUMULATIVE_65_74
## Time
## 3.915015
##
## $COUNT_ADDITIONAL_CUMULATIVE_75up
## Time
## 3.280791
Age Moving Rate Visualization
# Visualize the forecast slopes across age groups.
slopes_df <- data.frame(AgeGroup = names(slopes), Slope = unlist(slopes))
ggplot(slopes_df, aes(x = AgeGroup, y = Slope, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Slope of Forecasted Trends for Each Age Group",
    x = "Age Group",
    y = "Potential Willingness"
  )
Race Forecasting Model
# Re-filter the race data (idempotent if already filtered) and reshape to long.
trends_byrace$DATE <- as.Date(trends_byrace$DATE, format = "%Y-%m-%d")
trends_byrace <- filter(trends_byrace,
                        DATE >= as.Date("2021-08-01") & DATE <= as.Date("2022-05-31"))
race_groups <- c("COUNT_ADDITIONAL_CUMULATIVE_AIAN", "COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI",
                 "COUNT_ADDITIONAL_CUMULATIVE_BLACK", "COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO",
                 "COUNT_ADDITIONAL_CUMULATIVE_WHITE", "COUNT_ADDITIONAL_CUMULATIVE_OTHER")
trends_long <- melt(trends_byrace, id.vars = "DATE", measure.vars = race_groups)
colnames(trends_long)[colnames(trends_long) == "variable"] <- "Race_Group"
colnames(trends_long)[colnames(trends_long) == "value"] <- "Cumulative_Count"
trends_long$Cumulative_Count <- as.numeric(as.character(trends_long$Cumulative_Count))
# Residual ACF diagnostics for each race group's auto.arima fit.
for (race_group in race_groups) {
  series <- filter(trends_long, Race_Group == race_group)$Cumulative_Count
  ts_data <- ts(series, start = c(start_year, start_month), frequency = 12)
  model <- auto.arima(ts_data)
  residuals_data <- residuals(model)
  acf_plot <- Acf(residuals_data, main = paste("ACF of Residuals for", race_group))
  # ggsave(paste("ACF_plot_", race_group, ".png", sep = ""), plot = acf_plot, width = 7, height = 5)
  print(acf_plot)
}
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.018 -0.047 0.125 0.022 -0.198 -0.048 0.335 -0.107 -0.157 0.020
## 11 12 13 14 15 16 17 18 19 20 21
## -0.009 -0.019 -0.020 0.348 0.010 -0.125 0.122 0.086 -0.085 0.030 0.375
## 22 23 24
## -0.075 -0.198 0.007
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.010 0.017 -0.016 0.013 0.050 -0.066 0.154 -0.102 0.030 -0.027
## 11 12 13 14 15 16 17 18 19 20 21
## -0.033 0.005 -0.102 -0.039 0.017 0.075 0.060 -0.055 0.113 0.034 0.129
## 22 23 24
## -0.031 0.010 0.023
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.043 0.037 0.031 0.006 0.145 -0.216 0.136 -0.144 0.018 -0.046
## 11 12 13 14 15 16 17 18 19 20 21
## -0.098 0.004 -0.087 0.199 -0.117 0.172 -0.018 -0.056 0.165 -0.093 0.223
## 22 23 24
## -0.177 0.031 0.012
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.026 -0.044 0.114 0.058 -0.193 -0.160 0.460 -0.205 -0.183 0.103
## 11 12 13 14 15 16 17 18 19 20 21
## 0.007 -0.016 -0.077 0.455 -0.120 -0.025 0.123 0.033 -0.027 -0.078 0.353
## 22 23 24
## -0.155 -0.130 0.006
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.011 0.063 -0.152 -0.260 0.044 0.032 0.535 -0.082 0.030 -0.157
## 11 12 13 14 15 16 17 18 19 20 21
## -0.277 0.105 0.034 0.456 0.002 0.036 -0.177 -0.203 0.111 -0.010 0.450
## 22 23 24
## -0.022 -0.050 -0.154
##
## Autocorrelations of series 'residuals_data', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.034 0.031 0.015 -0.005 0.129 -0.185 0.160 -0.095 -0.003 0.046
## 11 12 13 14 15 16 17 18 19 20 21
## -0.057 0.010 -0.012 0.125 -0.072 0.174 -0.014 -0.030 0.134 -0.024 0.175
## 22 23 24
## -0.219 0.053 0.060
# Forecast setup for race groups: same window and horizon as the age models.
start_year <- 2021
start_month <- 8
latest_data_year <- 2022
latest_data_month <- 5
future_periods <- (2025 - latest_data_year) * 12 + (12 - latest_data_month)
forecasts <- list()
errors <- list()
for (race_group in race_groups) {
  ts_data <- ts(filter(trends_long, Race_Group == race_group)$Cumulative_Count,
                start = c(start_year, start_month), frequency = 12)
  # BUG FIX: tsCV supplies `h` to forecastfunction; the original
  # `function(x)` signature could not accept it, so every CV call errored
  # and the error series was silently all NA.
  error <- tsCV(ts_data, forecastfunction = function(x, h) forecast(auto.arima(x), h = h), h = 1)
  errors[[race_group]] <- error
  model <- auto.arima(ts_data)
  forecast_data <- forecast(model, h = future_periods)
  forecasts[[race_group]] <- forecast_data
}
Race Forecasting Result
# Plot each race group's historical series together with its forecast.
# FIX: seq_along() instead of 1:length(); index the list by name for
# consistency with the age-group loop.
for (i in seq_along(forecasts)) {
  race_group <- names(forecasts)[i]
  forecast_data <- forecasts[[race_group]]
  historical_data <- filter(trends_long, Race_Group == race_group) %>%
    select(Date = DATE, Count = Cumulative_Count) %>%
    mutate(Type = "Historical")
  forecast_dates <- seq(from = as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1),
                        length.out = length(forecast_data$mean),
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates,
                            Count = forecast_data$mean,
                            Type = "Forecasted")
  if (all(names(historical_data) == names(forecast_df))) {
    combined_df <- rbind(historical_data, forecast_df)
    # FIX: the title previously said "(2020 - 2025)" although the data
    # window begins August 2021; label now matches the age-group plots.
    p <- ggplot(combined_df, aes(x = Date, y = Count, color = Type)) +
      geom_line() +
      labs(title = paste("Forecast for Race Group", gsub("COUNT_ADDITIONAL_CUMULATIVE_", "", race_group), "(2021 - 2025)"),
           x = "Date",
           y = "Cumulative Count") +
      theme_minimal() +
      theme(legend.title = element_blank())
    print(p)
  } else {
    print(paste("Column mismatch in data frames for race group:", race_group))
  }
}
Race RMSE Result
# RMSE proxy per race group: forecast path vs the trailing block of
# historical observations of the same length.
rmse_results <- list()
for (race_group in names(forecasts)) {
  forecast_data <- forecasts[[race_group]]
  historical_data <- filter(trends_long, Race_Group == race_group) %>%
    select(Date = DATE, Count = Cumulative_Count) %>%
    mutate(Type = "Historical")
  forecast_start <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1)
  forecast_dates <- seq(from = forecast_start,
                        length.out = length(forecast_data$mean),
                        by = "month")
  forecast_df <- data.frame(Date = forecast_dates,
                            Count = forecast_data$mean,
                            Type = "Forecasted")
  if (all(names(historical_data) == names(forecast_df))) {
    combined_df <- rbind(historical_data, forecast_df)
    # Actuals available up to the day before the forecast start.
    end_date_for_actuals <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) - 1
    actual_values <- historical_data$Count[historical_data$Date <= end_date_for_actuals]
    if (length(actual_values) >= length(forecast_data$mean)) {
      actual_values <- tail(actual_values, length(forecast_data$mean))
      forecasted_values <- forecast_data$mean
      rmse <- sqrt(mean((actual_values - forecasted_values)^2, na.rm = TRUE)) # na.rm guards missing values
      rmse_results[[race_group]] <- rmse
    }
  }
}
rmse_results
## $COUNT_ADDITIONAL_CUMULATIVE_AIAN
## [1] 569.5745
##
## $COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI
## [1] 29400.16
##
## $COUNT_ADDITIONAL_CUMULATIVE_BLACK
## [1] 26319.53
##
## $COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO
## [1] 35724.15
##
## $COUNT_ADDITIONAL_CUMULATIVE_WHITE
## [1] 35481.1
##
## $COUNT_ADDITIONAL_CUMULATIVE_OTHER
## [1] 4678.765
Race Moving Rate
# Linear-trend slope of each race group's forecast path.
slopes <- list()
for (group_name in names(forecasts)) {
  fc <- forecasts[[group_name]]
  first_fc_date <- as.Date(paste(latest_data_year, latest_data_month, "01", sep="-")) %m+% months(1)
  fc_df <- data.frame(
    Date = seq(from = first_fc_date, length.out = length(fc$mean), by = "month"),
    Count = fc$mean
  )
  fc_df$Time <- as.numeric(fc_df$Date - min(fc_df$Date))  # days since forecast start
  trend_fit <- lm(Count ~ Time, data = fc_df)
  slopes[[group_name]] <- coef(trend_fit)["Time"]
}
slopes
## $COUNT_ADDITIONAL_CUMULATIVE_AIAN
## Time
## 0.2563122
##
## $COUNT_ADDITIONAL_CUMULATIVE_ASIAN_NHPI
## Time
## 12.75541
##
## $COUNT_ADDITIONAL_CUMULATIVE_BLACK
## Time
## 8.413805
##
## $COUNT_ADDITIONAL_CUMULATIVE_HISP_LATINO
## Time
## 13.68999
##
## $COUNT_ADDITIONAL_CUMULATIVE_WHITE
## Time
## 16.36981
##
## $COUNT_ADDITIONAL_CUMULATIVE_OTHER
## Time
## 1.964457
Race Moving Rate Visualization
# Visualize the forecast slopes across race groups.
slopes_df <- data.frame(RaceGroup = names(slopes), Slope = unlist(slopes))
ggplot(slopes_df, aes(x = RaceGroup, y = Slope, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Slope of Forecasted Trends for Each Race Group",
    x = "Race Group",
    y = "Slope Value"
  )
# Reload the demographic coverage snapshot (already read once at the top of
# the document; re-read here so this section is self-contained).
coverage_by_demo <- read.csv("people/coverage-by-demo.csv")
head(coverage_by_demo)
## DATE POPULATION GROUP SUBGROUP POP_DENOMINATOR
## 1 2023-09-14 Children (0-17) Age '0-4 523718.0
## 2 2023-09-14 Children (0-17) Age '5-12 747558.7
## 3 2023-09-14 Children (0-17) Age '13-17 432845.5
## 4 2023-09-14 Children (0-17) Borough Citywide 1704122.2
## 5 2023-09-14 Children (0-17) Borough Bronx 347020.4
## 6 2023-09-14 Children (0-17) Borough Brooklyn 575063.2
## COUNT_PARTIALLY_CUMULATIVE COUNT_FULLY_CUMULATIVE COUNT_1PLUS_CUMULATIVE
## 1 23733 41832 65565
## 2 57152 385370 442522
## 3 45273 361209 406482
## 4 126158 788411 914569
## 5 26828 151948 178776
## 6 30057 221873 251930
## COUNT_ADDITIONAL_CUMULATIVE COUNT_BIVALENT_ADDITIONAL_CUMULATIVE
## 1 212 18299
## 2 68382 55365
## 3 125097 38535
## 4 193691 112199
## 5 27368 16763
## 6 54083 32133
## PERC_PARTIALLY PERC_FULLY PERC_1PLUS PERC_ADDITIONAL PERC_BIVALENT_ADDITIONAL
## 1 4.53 7.99 12.52 0.04 3.49
## 2 7.65 51.55 59.20 9.15 7.41
## 3 10.46 83.45 93.91 28.90 8.90
## 4 7.40 46.26 53.67 11.37 6.58
## 5 7.73 43.79 51.52 7.89 4.83
## 6 5.23 38.58 43.81 9.40 5.59
# List all available columns in the coverage dataset.
names(coverage_by_demo)
## [1] "DATE"
## [2] "POPULATION"
## [3] "GROUP"
## [4] "SUBGROUP"
## [5] "POP_DENOMINATOR"
## [6] "COUNT_PARTIALLY_CUMULATIVE"
## [7] "COUNT_FULLY_CUMULATIVE"
## [8] "COUNT_1PLUS_CUMULATIVE"
## [9] "COUNT_ADDITIONAL_CUMULATIVE"
## [10] "COUNT_BIVALENT_ADDITIONAL_CUMULATIVE"
## [11] "PERC_PARTIALLY"
## [12] "PERC_FULLY"
## [13] "PERC_1PLUS"
## [14] "PERC_ADDITIONAL"
## [15] "PERC_BIVALENT_ADDITIONAL"
# Keep only the fields needed for the regression / random-forest models.
new_dataset <- coverage_by_demo %>% select(COUNT_ADDITIONAL_CUMULATIVE, SUBGROUP, GROUP, POP_DENOMINATOR)
head(new_dataset)
## COUNT_ADDITIONAL_CUMULATIVE SUBGROUP GROUP POP_DENOMINATOR
## 1 212 '0-4 Age 523718.0
## 2 68382 '5-12 Age 747558.7
## 3 125097 '13-17 Age 432845.5
## 4 193691 Citywide Borough 1704122.2
## 5 27368 Bronx Borough 347020.4
## 6 54083 Brooklyn Borough 575063.2
# Order rows alphabetically by subgroup for easier inspection of duplicates.
sorted_dataset <- new_dataset %>% arrange(SUBGROUP)
sorted_dataset
## COUNT_ADDITIONAL_CUMULATIVE SUBGROUP GROUP
## 1 193691 '0-17 Age
## 2 212 '0-4 Age
## 3 125097 '13-17 Age
## 4 259784 '18-24 Age
## 5 259784 '18-24 Age
## 6 544806 '25-34 Age
## 7 544806 '25-34 Age
## 8 502207 '35-44 Age
## 9 502207 '35-44 Age
## 10 517086 '45-54 Age
## 11 517086 '45-54 Age
## 12 68382 '5-12 Age
## 13 582544 '55-64 Age
## 14 582544 '55-64 Age
## 15 465590 '65-74 Age
## 16 465590 '65-74 Age
## 17 224665 '75-84 Age
## 18 224665 '75-84 Age
## 19 77835 '85+ Age
## 20 77835 '85+ Age
## 21 57155 Asian/NHPI Race/ethnicity
## 22 57097 Asian/NHPI Race/ethnicity
## 23 36721 Asian/NHPI Race/ethnicity
## 24 730049 Asian/NHPI Race/ethnicity
## 25 787204 Asian/NHPI Race/ethnicity
## 26 23527 Black Race/ethnicity
## 27 23511 Black Race/ethnicity
## 28 16648 Black Race/ethnicity
## 29 493271 Black Race/ethnicity
## 30 516798 Black Race/ethnicity
## 31 27368 Bronx Borough
## 32 27363 Bronx Borough
## 33 19030 Bronx Borough
## 34 423623 Bronx Borough
## 35 450991 Bronx Borough
## 36 54083 Brooklyn Borough
## 37 53995 Brooklyn Borough
## 38 32516 Brooklyn Borough
## 39 865300 Brooklyn Borough
## 40 919383 Brooklyn Borough
## 41 193691 Citywide Borough
## 42 193479 Citywide Borough
## 43 125097 Citywide Borough
## 44 3174517 Citywide Borough
## 45 3368208 Citywide Borough
## 46 99443 Female Sex
## 47 99333 Female Sex
## 48 65584 Female Sex
## 49 1746356 Female Sex
## 50 1845799 Female Sex
## 51 48988 Hispanic/Latino Race/ethnicity
## 52 48969 Hispanic/Latino Race/ethnicity
## 53 34352 Hispanic/Latino Race/ethnicity
## 54 687591 Hispanic/Latino Race/ethnicity
## 55 736579 Hispanic/Latino Race/ethnicity
## 56 93893 Male Sex
## 57 93791 Male Sex
## 58 59234 Male Sex
## 59 1413385 Male Sex
## 60 1507278 Male Sex
## 61 48284 Manhattan Borough
## 62 48221 Manhattan Borough
## 63 27337 Manhattan Borough
## 64 777827 Manhattan Borough
## 65 826111 Manhattan Borough
## 66 660 Multiracial Race/ethnicity
## 67 656 Multiracial Race/ethnicity
## 68 80 Multiracial Race/ethnicity
## 69 4695 Multiracial Race/ethnicity
## 70 5355 Multiracial Race/ethnicity
## 71 951 Native American/Alaska Native Race/ethnicity
## 72 951 Native American/Alaska Native Race/ethnicity
## 73 764 Native American/Alaska Native Race/ethnicity
## 74 12164 Native American/Alaska Native Race/ethnicity
## 75 13115 Native American/Alaska Native Race/ethnicity
## 76 3478 Other Race/ethnicity
## 77 32 Other Sex
## 78 3476 Other Race/ethnicity
## 79 32 Other Sex
## 80 2728 Other Race/ethnicity
## 81 27 Other Sex
## 82 91673 Other Race/ethnicity
## 83 853 Other Sex
## 84 95151 Other Race/ethnicity
## 85 885 Other Sex
## 86 158 Prefer not to answer Sex
## 87 158 Prefer not to answer Sex
## 88 112 Prefer not to answer Sex
## 89 4272 Prefer not to answer Sex
## 90 4430 Prefer not to answer Sex
## 91 55198 Queens Borough
## 92 55147 Queens Borough
## 93 39604 Queens Borough
## 94 945978 Queens Borough
## 95 1001176 Queens Borough
## 96 8758 Staten Island Borough
## 97 8753 Staten Island Borough
## 98 6610 Staten Island Borough
## 99 161789 Staten Island Borough
## 100 170547 Staten Island Borough
## 101 5892 Unknown Race/ethnicity
## 102 165 Unknown Sex
## 103 5886 Unknown Race/ethnicity
## 104 165 Unknown Sex
## 105 4161 Unknown Race/ethnicity
## 106 140 Unknown Sex
## 107 105777 Unknown Race/ethnicity
## 108 9651 Unknown Sex
## 109 111669 Unknown Race/ethnicity
## 110 9816 Unknown Sex
## 111 53040 White Race/ethnicity
## 112 52933 White Race/ethnicity
## 113 29643 White Race/ethnicity
## 114 1049297 White Race/ethnicity
## 115 1102337 White Race/ethnicity
## POP_DENOMINATOR
## 1 1704122.18
## 2 523718.00
## 3 432845.48
## 4 704670.82
## 5 704670.82
## 6 1483699.00
## 7 1483699.00
## 8 1136906.00
## 9 1136906.00
## 10 1028087.00
## 11 1028087.00
## 12 747558.70
## 13 998927.00
## 14 998927.00
## 15 718795.00
## 16 718795.00
## 17 382672.00
## 18 382672.00
## 19 178938.00
## 20 178938.00
## 21 215999.98
## 22 148927.98
## 23 54811.75
## 24 1017642.02
## 25 1233642.00
## 26 373002.78
## 27 266476.78
## 28 104094.26
## 29 1452845.22
## 30 1825848.00
## 31 347020.43
## 32 246821.43
## 33 92110.57
## 34 1071186.57
## 35 1418207.00
## 36 575063.22
## 37 392137.22
## 38 140161.79
## 39 1984839.78
## 40 2559903.00
## 41 1704122.18
## 42 1180404.18
## 43 432845.48
## 44 6632694.82
## 45 8336817.00
## 46 834086.34
## 47 578537.34
## 48 212834.37
## 49 3524291.66
## 50 4358378.00
## 51 599599.62
## 52 423992.62
## 53 155999.08
## 54 1823990.38
## 55 2423590.00
## 56 870035.84
## 57 601866.84
## 58 220011.10
## 59 3108403.16
## 60 3978439.00
## 61 231257.76
## 62 155113.76
## 63 55915.07
## 64 1397448.24
## 65 1628706.00
## 66 56318.27
## 67 35031.27
## 68 9790.69
## 69 96578.73
## 70 152897.00
## 71 3843.07
## 72 3157.07
## 73 1562.20
## 74 15020.93
## 75 18864.00
## 76 NA
## 77 NA
## 78 NA
## 79 NA
## 80 NA
## 81 NA
## 82 NA
## 83 NA
## 84 NA
## 85 NA
## 86 NA
## 87 NA
## 88 NA
## 89 NA
## 90 NA
## 91 447805.83
## 92 310710.83
## 93 114792.04
## 94 1806052.17
## 95 2253858.00
## 96 102974.94
## 97 75620.94
## 98 29866.00
## 99 373168.06
## 100 476143.00
## 101 NA
## 102 NA
## 103 NA
## 104 NA
## 105 NA
## 106 NA
## 107 NA
## 108 NA
## 109 NA
## 110 NA
## 111 455358.46
## 112 302818.46
## 113 106587.51
## 114 2226617.54
## 115 2681976.00
# One-hot encode GROUP and SUBGROUP, then drop the original factor columns.
dummy_data <- sorted_dataset %>%
  dummy_cols(c("GROUP", "SUBGROUP")) %>%
  select(-c(GROUP, SUBGROUP))
# Columns excluded from modeling: ambiguous subgroups, the population
# denominator, and the aggregate 0-17 band (its component bands are kept).
columns_to_remove <- c(
  "SUBGROUP_Unknown",
  "SUBGROUP_Other",
  "SUBGROUP_Prefer not to answer",
  "POP_DENOMINATOR",
  "SUBGROUP_'0-17"
)
# FIX: the original defined `columns_to_remove` but never used it, re-listing
# the same columns by hand in the select(); drop them via the vector so there
# is a single source of truth.
dummy_data <- dummy_data %>%
  select(-all_of(columns_to_remove))
# Map raw dummy-column names onto cleaner, model-friendly identifiers.
# NOTE(review): "SUBGROUP_Staten Island" is mapped to "Race_Staten_Island",
# but Staten Island is a borough, not a race/ethnicity. Downstream code
# (importance tables, color palettes) references this exact name, so the
# mapping is preserved as-is; confirm and rename across the document if needed.
column_name_mapping <- c(
  "GROUP_Race/ethnicity" = "GROUP_Race_ethnicity",
  "SUBGROUP_'0-4" = "Age_0_4",
  "SUBGROUP_'5-12" = "Age_5_12",
  "SUBGROUP_'13-17" = "Age_13_17",
  "SUBGROUP_'18-24" = "Age_18_24",
  "SUBGROUP_'25-34" = "Age_25_34",
  "SUBGROUP_'35-44" = "Age_35_44",
  "SUBGROUP_'45-54" = "Age_45_54",
  "SUBGROUP_'55-64" = "Age_55_64",
  "SUBGROUP_'65-74" = "Age_65_74",
  "SUBGROUP_'75-84" = "Age_75_84",
  "SUBGROUP_'85+" = "Age_85_plus",
  "SUBGROUP_Asian/NHPI" = "Race_Asian_NHPI",
  "SUBGROUP_Hispanic/Latino" = "Race_Hispanic_Latino",
  "SUBGROUP_Native American/Alaska Native" = "Race_Native_American_Alaska_Native",
  "SUBGROUP_Staten Island" = "Race_Staten_Island",
  "SUBGROUP_Black" = "Race_Black",
  "SUBGROUP_White" = "Race_White",
  "SUBGROUP_Multiracial" = "Race_Multiracial",
  "SUBGROUP_Female" = "Gender_F",
  "SUBGROUP_Male" = "Gender_M"
)
# Drop rows with missing values before modeling.
dummy_data <- na.omit(dummy_data)
# Vectorized rename: columns found in the mapping get their new name,
# everything else keeps its current name.
hit <- match(colnames(dummy_data), names(column_name_mapping))
mapped <- !is.na(hit)
colnames(dummy_data)[mapped] <- column_name_mapping[hit[mapped]]
# Baseline OLS: cumulative additional doses regressed on all demographic
# dummies (singularities expected for perfectly collinear dummies).
model <- lm(COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
summary(model)
##
## Call:
## lm(formula = COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1285901 -191232 -1207 7674 1957210
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2060 132427 0.016 0.98763
## GROUP_Age 191631 529707 0.362 0.71840
## GROUP_Borough 69232 264854 0.261 0.79440
## GROUP_Race_ethnicity 40929 209385 0.195 0.84548
## GROUP_Sex NA NA NA NA
## Age_0_4 -193479 725331 -0.267 0.79030
## Age_5_12 -125309 725331 -0.173 0.86324
## Age_13_17 -68594 725331 -0.095 0.92487
## Age_18_24 66093 628155 0.105 0.91645
## Age_25_34 351115 628155 0.559 0.57762
## Age_35_44 308516 628155 0.491 0.62456
## Age_45_54 323395 628155 0.515 0.60798
## Age_55_64 388853 628155 0.619 0.53751
## Age_65_74 271899 628155 0.433 0.66619
## Age_75_84 30974 628155 0.049 0.96079
## Age_85_plus -115856 628155 -0.184 0.85410
## Race_Asian_NHPI 290656 280920 1.035 0.30370
## Race_Black 171762 280920 0.611 0.54251
## SUBGROUP_Bronx 118384 324378 0.365 0.71603
## SUBGROUP_Brooklyn 313764 324378 0.967 0.33609
## SUBGROUP_Citywide 1339707 324378 4.130 8.3e-05 ***
## Gender_F 769243 264854 2.904 0.00466 **
## Race_Hispanic_Latino 268307 280920 0.955 0.34217
## Gender_M 631456 264854 2.384 0.01929 *
## SUBGROUP_Manhattan 274265 324378 0.846 0.40015
## Race_Multiracial -40700 280920 -0.145 0.88514
## Race_Native_American_Alaska_Native -37400 280920 -0.133 0.89439
## SUBGROUP_Queens 348129 324378 1.073 0.28614
## Race_Staten_Island NA NA NA NA
## Race_White 414461 280920 1.475 0.14372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 512900 on 87 degrees of freedom
## Multiple R-squared: 0.3463, Adjusted R-squared: 0.1434
## F-statistic: 1.707 on 27 and 87 DF, p-value: 0.03311
# Strip the remaining SUBGROUP_ prefix, then fit a random forest and extract
# per-feature importance (IncNodePurity for regression forests).
colnames(dummy_data) <- gsub("^SUBGROUP_", "", colnames(dummy_data))
rf_model <- randomForest(COUNT_ADDITIONAL_CUMULATIVE ~ ., data = dummy_data)
raw_importance <- as.data.frame(importance(rf_model))
importance_data <- data.frame(
  Feature = rownames(raw_importance),
  Importance = raw_importance$IncNodePurity
)
# Keep a copy sorted from most to least important.
importance_data_ordered <- importance_data[order(importance_data$Importance, decreasing = TRUE), ]
print(importance_data)
## Feature Importance
## 1 GROUP_Age 2.546310e+11
## 2 GROUP_Borough 6.857794e+11
## 3 GROUP_Race_ethnicity 3.457317e+11
## 4 GROUP_Sex 4.254633e+11
## 5 Age_0_4 5.547221e+10
## 6 Age_5_12 3.569457e+10
## 7 Age_13_17 1.703853e+10
## 8 Age_18_24 1.625324e+10
## 9 Age_25_34 1.304611e+11
## 10 Age_35_44 1.052718e+11
## 11 Age_45_54 1.076199e+11
## 12 Age_55_64 1.703212e+11
## 13 Age_65_74 7.106815e+10
## 14 Age_75_84 1.851230e+10
## 15 Age_85_plus 7.287943e+10
## 16 Race_Asian_NHPI 2.125262e+11
## 17 Race_Black 7.130117e+10
## 18 Bronx 1.358951e+11
## 19 Brooklyn 2.459527e+11
## 20 Citywide 6.598033e+12
## 21 Gender_F 1.849997e+12
## 22 Race_Hispanic_Latino 1.865358e+11
## 23 Gender_M 1.089288e+12
## 24 Manhattan 1.559977e+11
## 25 Race_Multiracial 1.302214e+11
## 26 Race_Native_American_Alaska_Native 1.142840e+11
## 27 Queens 2.761417e+11
## 28 Race_Staten_Island 2.374544e+11
## 29 Race_White 5.574445e+11
# Features ranked from most to least important.
print(importance_data_ordered)
## Feature Importance
## 20 Citywide 6.598033e+12
## 21 Gender_F 1.849997e+12
## 23 Gender_M 1.089288e+12
## 2 GROUP_Borough 6.857794e+11
## 29 Race_White 5.574445e+11
## 4 GROUP_Sex 4.254633e+11
## 3 GROUP_Race_ethnicity 3.457317e+11
## 27 Queens 2.761417e+11
## 1 GROUP_Age 2.546310e+11
## 19 Brooklyn 2.459527e+11
## 28 Race_Staten_Island 2.374544e+11
## 16 Race_Asian_NHPI 2.125262e+11
## 22 Race_Hispanic_Latino 1.865358e+11
## 12 Age_55_64 1.703212e+11
## 24 Manhattan 1.559977e+11
## 18 Bronx 1.358951e+11
## 9 Age_25_34 1.304611e+11
## 25 Race_Multiracial 1.302214e+11
## 26 Race_Native_American_Alaska_Native 1.142840e+11
## 11 Age_45_54 1.076199e+11
## 10 Age_35_44 1.052718e+11
## 15 Age_85_plus 7.287943e+10
## 17 Race_Black 7.130117e+10
## 13 Age_65_74 7.106815e+10
## 5 Age_0_4 5.547221e+10
## 6 Age_5_12 3.569457e+10
## 14 Age_75_84 1.851230e+10
## 7 Age_13_17 1.703853e+10
## 8 Age_18_24 1.625324e+10
# Color palette per feature family: age = red, race = green, borough = orange,
# gender = pink.
age_colors <- c("Age_0_4" = "red","Age_5_12" = "red",
                "Age_13_17" = "red", "Age_18_24" = "red", "Age_25_34" = "red",
                "Age_35_44" = "red", "Age_45_54" = "red", "Age_55_64" = "red",
                "Age_65_74" = "red", "Age_75_84" = "red", "Age_85_plus" = "red")
race_colors <- c("Race_Asian_NHPI" = "green", "Race_Black" = "green",
                 "Race_Multiracial" = "green", "Race_Native_American_Alaska_Native" = "green",
                 "Race_Hispanic_Latino" = "green", "Race_White" = "green","Race_Staten_Island" = "green")
borough_colors <- c("Bronx" = "orange", "Brooklyn" = "orange",
                    "Citywide" = "orange", "Manhattan" = "orange", "Queens" = "orange")
gender_colors <- c("Gender_F" = "pink", "Gender_M" = "pink")
# Horizontal bar chart of random-forest feature importance.
# FIX: the original stacked six separate theme() calls, and its first
# plot.title spec (hjust = 0.2, size = 16) was entirely overridden by a later
# one (hjust = 0.5, size = 20). The redundant spec is dropped and all theme
# settings are consolidated into a single call with the effective values.
ggplot(importance_data, aes(x = Feature, y = Importance, fill = Feature)) +
  geom_bar(stat = "identity", width = 0.7) +
  labs(x = "Feature", y = "Importance", fill = "Feature") +
  ggtitle("Feature Importance with Booster") +
  scale_fill_manual(values = c(age_colors, race_colors, borough_colors, gender_colors)) +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 4, vjust = 0.5),
    axis.text.y = element_text(angle = 0, hjust = 0.5, size = 4, vjust = 0.5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white"),
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold"),
    legend.position = "bottom",
    legend.text = element_text(size = 4),
    legend.title = element_text(size = 4, face = "bold"),
    legend.box = "horizontal"
  )
#install.packages(c("tm", "stringr", "textclean"))
#install.packages("wordcloud")
#install.packages("plotly")
#install.packages("textdata")
#install.packages("afinn")
#install.packages("tidytext")
#install.packages("treemap")
library(readr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringr)
library(textclean)
library(readr)
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(dplyr)
library(treemap)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the raw comment corpus: pipe-delimited file with columns Text, Label.
covid <- utils::read.csv("train.csv", sep = "|", stringsAsFactors = FALSE)
colnames(covid)
## [1] "Text" "Label"
# Ages 18-35 as standalone numbers (1[89] = 18-19, 2[0-9] = 20-29,
# 3[0-5] = 30-35). The original pattern also listed "N years old", "N y/o"
# and "N yo" variants, but those were redundant: the bare-number alternative
# already matches the number inside each of them, so row selection is
# unchanged.
age_pattern <- "\\b(1[89]|2[0-9]|3[0-5])\\b"
# NOTE(review): these patterns are case-sensitive, so e.g. "Covid"/"COVID"
# only match via other keywords — confirm this is intended.
covid_pattern <- "\\bcovid\\b|\\bcoronavirus\\b"
vaccine_pattern <- "\\bvaccine\\b|\\bvaccination\\b|\\bimmunization\\b"
race_pattern <- "\\bWhite\\b|\\bBlack\\b|\\bAfrican American\\b|\\bAsian\\b|\\bHispanic\\b|\\bLatino\\b"
usa_pattern <- "\\bUnited States\\b|\\bUSA\\b|\\bAmerica\\b|\\bAmerican\\b"
# Keep rows whose Text matches ANY topic pattern. One combined alternation
# replaces five separate str_detect() passes over the text column, selecting
# exactly the same rows.
keyword_pattern <- paste(age_pattern, covid_pattern, vaccine_pattern,
                         race_pattern, usa_pattern, sep = "|")
filtered_data <- covid[str_detect(covid$Text, keyword_pattern), ]
head(filtered_data)
## Text
## 3 I do agree with you but I also think USA has a few differences that will make things worse here1 general poor health if people Many comorbidities like hbp diabetes coronary disease are extremely common and considered normal for anyone over 402 number of hospital beds and especially icu beds per capital are much lower than sk3 access to healthcare is much more difficult here than in sk Many people will not be able to afford it or afford time off work etc making them avoid treatment until its to late or until theyve continued spreading4 we simply arent testing which increases community spread This wont effect things in a percentage way but will increase infected and death totals
## 47 I dont know how you got to it you seem a little confused about it They are referring to this studyhttpswwwpharmaceuticaltechnologycomnewscoronavirusincubationperiod27days which is the only one to come up with 27 days for a possible incubation period All of the other studies done have concluded that the incubation period is 14 days or less 27 is the outlier
## 64 2 of 50k is 10001000 restraunts employing lets say 12 people each thats 12k 12k people lets assume they aall get min wage thats 12k72587k an hour Thats 348 million a week for however many weeks Because each of the 12 only works say 40 hours a week 87k40The ceo of Yum made 22 million in 2013 sorry most recent I could find 348m is 158 of 22m Theyre paying 15 a week of the ceos salary for that Thats not exactly chump change For perspective thats over 5 what god demands be given to the church
## 102 For those who cant see the post I copied it in two parts in the comments below see Part 2 in another commenthttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhehttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhePART 1throwaway1Dgrzqhttpswwwredditcomuserthrowaway1Dgrzq 515 points15 hours agohttpswwwredditcomrCoronaviruscommentsffa2tftestimonyofasurgeonworkinginbergamointhefjx5tc8edited 14 hours agolate edit I need to add a disclaimer dont go to the linked thread if you have any sort of anxiety issues that other sub is extremely grim and not supportive of mental health I apologize for the late warningEnglish translation by mcgianni on rmedicinehttpsnpredditcomrmedicinecommentsff8hnstestimonyofasurgeonworkinginbergamointhe NPlinked please do not participate if you are a nonmedical professionalIn one of the nonstop emails that I receive from my hospital administration on a more than daily basis there was a paragraph on how to be responsible on social media with some recommendations that we all can agree on After thinking for a long time if and what to write about whats happening here I felt that silence was not responsible I will therefore try to convey to laypeople those who are more distant from our reality what we are experiencing in Bergamo during these Covid19 pandemic days I understand the need not to panic but when the message of the danger of what is happening is not out and I still see people ignoring the recommendations and people who gather together complaining that they cannot go to the gym or play soccer tournaments I shiver I also understand the economic damage and I am also worried about that After this epidemic it will be hard to start over Still beside the fact that we are also devastating our national health system from an economic point of view I want to point out that the public health damage that is going to invest the country is more important and I find it nothing short of 
chilling that new quarantine areas requested by the Region has not yet been established for the municipalities of Alzano Lombardo and Nembro I would like to clarify that this is purely personal opinion I myself looked with some amazement at the reorganization of the entire hospital in the previous week when our current enemy was still in the shadows the wards slowly emptied elective activities interrupted intensive care unit freed to create as many beds as possible Containers arriving in front of the emergency room to create diversified routes and avoid infections All this rapid transformation brought in the hallways of the hospital an atmosphere of surreal silence and emptiness that we did not understand waiting for a war that had yet to begin and that many including me were not so sure would never come with such ferocity I open a parenthesis all this was done in the shadows and without publicity while several newspapers had the courage to say that private health care was not doing anything I still remember my night shift a week ago spent without any rest waiting for a call from the microbiology department I was waiting for the results of a swab taken from the first suspect case in our hospital thinking about what consequences it would have for us and the hospital If I think about it my agitation for one possible case seems almost ridiculous and unjustified now that I have seen what is happening Well the situation is now nothing short of dramatic No other words come to mind The war has literally exploded and battles are uninterrupted day and night One after the other these unfortunate people come to the emergency room They have far from the complications of a flu Lets stop saying its a bad flu In my two years working in Bergamo I have learned that the people here do not come to the emergency room for no reason They did well this time too They followed all the recommendations given a week or ten days at home with a fever without going out to prevent contagion but 
now they cant take it anymore They dont breathe enough they need oxygen Drug therapies for this virus are few The course mainly depends on our organism We can only support it when it cant take it anymore It is mainly hoped that our body will eradicate the virus on its own lets face it Antiviral therapies are experimental on this virus and we learn its behavior day after day Staying at home until the symptoms worsen does not change the prognosis of the disease Now however that need for beds in all its drama has arrived One after another the departments that had been emptied are filling up at an impressive rate The display boards with the names of the sicks of different colors depending on the department they belong to are now all red and instead of the surgical procedure there is the diagnosis which is always the same bilateral interstitial pneumonia Now tell me which flu virus causes such a rapid tragedy Because thats the difference now I get a little technical in classical flu besides that it infects much less population over several months cases are complicated less frequently only when the virus has destroyed the protective barriers of our airways and as such it allows bacteria which normally resident in the upper airways to invade the bronchi and lungs causing a more serious disease Covid 19 causes a banal flu in many young people but in many elderly people and not only a real SARS because it invades the alveoli of the lungs directly and it infects them making them unable to perform their function The resulting respiratory failure is often serious and after a few days of hospitalization the simple oxygen that can be administered in a ward may not be enough Sorry but to me as a doctor its not reassuring that the most serious are mainly elderly people with other pathologies The elderly population is the most represented in our country and it is difficult to find someone who above 65 years of age does not take at least a pill for high blood pressure or diabetes I 
can also assure you that when you see young people who end up intubated in the ICU pronated or worse in ECMO a machine for the worst cases which extracts the blood reoxygenates it and returns it to the body waiting for the lungs to hopefully heal all this confidence for your young age goes away And while there are still people on social media who boast of not being afraid by ignoring the recommendations protesting that their normal lifestyle habits have temporarily halted the epidemiological disaster is taking place And there are no more surgeons urologists orthopedists we are only doctors who suddenly become part of a single team to face this tsunami that has overwhelmed us12See my other comment for PART 2
## 104 It was just a stupid snobby thing to say by the elites and the expertsIts not a coincidence that East Asian countries have had so much success When there is so much asymptomatic transmission its better of if we tell people to wear it not that its useless
## 132 Government confirmation press release 01282020 No 12 GP Download PDFhttpswwwstmgpbayerndepressedreiweiterecoronavirusfaelleinbayernzusammenhangmitdemerstenfallbayernsoutputpdfThree more coronavirus cases in Bavaria connection with the first case Bavarias Minister of Health Huml around 40 people should be tested on Wednesday as a precaution The Bavarian Ministry of Health was informed on Tuesday evening that three other people in Bavaria had been infected with the novel corona virus These patients are also employees of the company from the district of Starnberg where the first person affected is already employed This man had apparently contracted a Chinese colleague during a training session on January 21 This colleague flew back to China on January 23 On January 27 the company informed the health department of the Chinese womans illness It was decided that the three new patients should also be admitted to the Munich Clinic Schwabing where they would be medically monitored and isolated A few other contact persons are currently testing whether they are infected with the corona virus The Bavarian Ministry of Health and the State Office for Health and Food Safety LGL will report on details in a press release on Wednesday Bavarias Minister of Health Melanie Huml emphasized on Tuesday evening in Munich A total of around 40 employees of the company were identified who could be considered as close contacts Those affected should be tested on Wednesday as a precautionary measure They will also be examined by the LGLs Task Force Infectiology questioned in detail
## Label
## 3 0
## 47 0
## 64 1
## 102 0
## 104 1
## 132 0
# Build a tm corpus from the filtered comment text and normalise it:
# lowercase, strip punctuation, then drop English stopwords.
corpus <- Corpus(VectorSource(filtered_data$Text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
# NOTE(review): stopwords are removed AFTER punctuation is stripped, so
# contraction stopwords like "don't" can no longer match ("dont" survives).
# The sample texts above already appear punctuation-free, so this may be
# moot for this dataset — confirm against the raw train.csv.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
# Whitespace tokenizer: splits one document string into a vector of words.
tokenize <- function(x) {
unlist(str_split(x, pattern = " "))
}
# NOTE(review): this manual tokenization is presumably for the word cloud;
# unnest_tokens() below re-tokenizes sentiment_data itself, so the step may
# be redundant for the sentiment branch — verify before simplifying.
corpus <- tm_map(corpus, content_transformer(tokenize))
# One text column derived from the (tokenized) corpus contents; consumed by
# unnest_tokens() in the sentiment step.
sentiment_data <- data.frame(text = sapply(corpus, as.character))
# Flatten the whole corpus into a single space-separated string and draw a
# 100-word cloud from it.
text_combined <- paste(unlist(corpus), collapse = " ")
wordcloud_obj <- wordcloud(text_combined, max.words = 100, scale = c(5, 0.8), colors = brewer.pal(8, "Dark2"))
# Score each distinct word with the AFINN lexicon: tokenize, keep only words
# present in AFINN (explicit by = "word" silences the join message), and sum
# the per-occurrence values so frequent words accumulate larger magnitudes.
sentiment_data <- sentiment_data %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarise(sentiment_score = sum(value))
# Top 20 positive words by total score.
positive_words <- sentiment_data %>%
  filter(sentiment_score >= 0.5) %>%
  slice_max(sentiment_score, n = 20)
# Top 20 negative words, i.e. the MOST negative total scores. The original
# used filter(sentiment_score <= 0.5) with top_n(20, wt = sentiment_score),
# which (a) overlapped the positive set at exactly 0.5, (b) admitted
# zero-score words, and (c) picked the LEAST negative words because top_n
# takes the highest values. slice_min on a <= -0.5 filter fixes all three.
negative_words <- sentiment_data %>%
  filter(sentiment_score <= -0.5) %>%
  slice_min(sentiment_score, n = 20)
head(negative_words)
## # A tibble: 6 × 2
## word sentiment_score
## <chr> <dbl>
## 1 apologise -1
## 2 apologized -1
## 3 awaits -1
## 4 axe -1
## 5 cancels -1
## 6 clouded -1
head(positive_words)
## # A tibble: 6 × 2
## word sentiment_score
## <chr> <dbl>
## 1 best 918
## 2 better 990
## 3 care 1092
## 4 focused 352
## 5 good 1953
## 6 great 645
# Sort the top positive words from highest to lowest total sentiment score,
# then print the tibble. decreasing = TRUE on a stable sort yields the same
# row order (including ties) as ordering on the negated score.
positive_words <- positive_words[order(positive_words$sentiment_score, decreasing = TRUE), ]
positive_words
## # A tibble: 20 × 2
## word sentiment_score
## <chr> <dbl>
## 1 like 3968
## 2 good 1953
## 3 care 1092
## 4 better 990
## 5 best 918
## 6 help 694
## 7 positive 652
## 8 great 645
## 9 please 581
## 10 hope 532
## 11 support 516
## 12 want 478
## 13 healthy 390
## 14 true 390
## 15 united 365
## 16 welcome 356
## 17 focused 352
## 18 lol 351
## 19 important 332
## 20 novel 322
# Treemap of the 20 most positive words, tile area proportional to total
# sentiment score. The original passed path = ~word and ids = ~word without
# a data argument: formulas cannot resolve `word` with no data attached, and
# `path` is not a plot_ly treemap attribute (it belongs to plotly express).
# Supplying data = positive_words and dropping the invalid argument fixes
# the mapping.
fig <- plot_ly(
  data = positive_words,
  labels = ~word,
  parents = "",
  values = ~sentiment_score,
  type = "treemap",
  ids = ~word,
  textinfo = "label+value"
)
fig <- fig %>% layout(
  title = "20 Most Positive Words, Age 15-34, All Race",
  font = list(size = 14),
  margin = list(l = 0, r = 0, b = 0, t = 40)
)
fig
# Treemap of the most negative words; abs() flips the (negative) totals so
# tile areas are positive. plot_ly() is piped straight into layout() rather
# than reassigning fig between the two calls.
fig <- plot_ly(
  labels = negative_words$word,
  parents = "",
  values = abs(negative_words$sentiment_score),
  type = "treemap"
) %>%
  layout(
    title = "20 Most Negative Words",
    font = list(size = 14),
    margin = list(l = 0, r = 0, b = 0, t = 40)
  )
fig
# Overall verdict from the top positive and top negative word totals.
total_positive_score <- sum(positive_words$sentiment_score)
# Negative words carry negative scores, so this sum is <= 0.
total_negative_score <- sum(negative_words$sentiment_score)
threshold_value <- 5
# Net score = positive total plus the (negative) negative total. The
# original computed positive MINUS negative, which ADDED the negative
# magnitude and inflated positivity.
net_score <- total_positive_score + total_negative_score
# The original labelled any net score below +5 as "Negative" (so a net of
# +3 read as Negative) and could only reach "Neutral" at exact equality.
# Use a symmetric dead zone instead: above +threshold is Positive, below
# -threshold is Negative, anything in between is Neutral.
if (net_score > threshold_value) {
  overall_sentiment <- "Positive"
} else if (net_score < -threshold_value) {
  overall_sentiment <- "Negative"
} else {
  overall_sentiment <- "Neutral"
}
cat("Overall Sentiment:", overall_sentiment, "\n")
## Overall Sentiment: Positive